1.1 Import pakages
# pakages
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import pydotplus
from os import path
from sklearn import tree
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from scipy import spatial
from IPython.display import Image
plt.style.use("ggplot")
# let picture inline
%matplotlib inline
1.2 Set some useful constant variables
# Project workspace
WORKSPACE="D:\\Desktop\\563\\Assignment\\Project"
1.3 Load data
movies_file_path = path.join(WORKSPACE, "data", "tmdb_5000_movies.csv")
credits_file_path=path.join(WORKSPACE, "data", "tmdb_5000_credits.csv")
movies=None
Credits=None
try:
movies=pd.read_csv(movies_file_path)
credits=pd.read_csv(credits_file_path)
print("Read movies and credits successfully")
except e:
print("Error: ",e)
finally:
pass
1.4 Preview data
movies.info()
movies.head(1)
credits.info()
credits.head(1)
1.5 Data pre-Process
# data duplication
if len(movies.id.unique())==len(movies):
print("No duplication\n")
else:
print("Duplication exists\n")
# data missing
print(movies.isnull().any())
print("There are only homepage, overview, release_data, run_time columns which is not important missing data.")
print("We do not need these information, so that just ignore them\n")
# data recording error
print("Didn't find data recording error")
# merge two data sets
movies_credits=movies.merge(credits,left_on="id",right_on="movie_id",how="left")
movies_credits.info()
movies_credits.head(1)
# extract useful data from json format columns: genres, keywords, casts...
columns=['genres', 'keywords', 'production_companies', 'production_countries', 'cast', 'crew']
for col in columns:
movies_credits[col] = movies_credits[col].apply(json.loads)
useful_colums=['genres', 'production_companies', 'production_countries']
func=lambda x:[[item['name'] for item in row] for row in x]
for col in useful_colums:
movies_credits[col]=func(movies_credits[col])
# 4 stars
movies_credits['stars']=[[item['name']for item in row[:4]]for row in movies_credits['cast']]
# 10 keywords
movies_credits['keywords']=[[item['name']for item in row[:10]]for row in movies_credits['keywords']]
# process director, weiter, crew
def extract_crew_job(crew,job):
name=""
for item in crew:
if item['job']==job:
name=item['name']
break
return name
movies_credits['director'] = [extract_crew_job(crew, 'Director') for crew in movies_credits.crew]
movies_credits['writer'] = [extract_crew_job(crew, 'Writer') for crew in movies_credits.crew]
movies_credits['producer'] = [extract_crew_job(crew, 'Producer') for crew in movies_credits.crew]
# process release date
func=lambda x:x.year
movies_credits['year'] = pd.to_datetime(movies_credits['release_date']).apply(func)
1.5 Select data
movies = movies_credits[['title_x', 'genres', 'keywords', 'director', 'stars', 'writer', 'producer', 'budget', 'revenue','popularity', 'vote_average', 'vote_count', 'production_companies', 'production_countries', 'year']]
#movies.info()
movies=movies.dropna()
movies.rename(columns={'title_x': 'title'}, inplace=True)
movies.year = movies.year.astype(int)
movies.describe()
color=[x['color'] for x in list(plt.rcParams['axes.prop_cycle'])]
movies[movies.year>=2000].groupby('year').size().plot(kind='bar',color=color)
From picture above, the average movie number is around 200 and number of 2016 is less than average a lot. Therefore, we could infer the data of 2016 is not complete and we only use data from 2000 to 2015.
movies0015=movies[(movies.year >= 2000) & (movies.year < 2016) & (movies.vote_count > 40) &(movies.budget * movies.revenue * movies.popularity * movies.vote_average !=0)]
movies0015=movies0015.reset_index(drop = 'True')
2.1 Revenue Top 10
movies0015.sort_values('revenue',ascending=False)[['title','revenue','budget','genres']][:10]
2.2 Revenue, budget and rating of return(ROI) variation trend
bgt_rvn=movies0015.groupby('year')['budget','revenue'].sum()
bgt_rvn['ROI']=(bgt_rvn.revenue-bgt_rvn.budget)/bgt_rvn.budget
fig,axes=plt.subplots(2,1,figsize=(6,6))
bgt_rvn.iloc[:,:2].plot(kind='bar',ax=axes[0],title='Budget and Revenue')
axes[0].set_ylabel('Dollar')
bgt_rvn.ROI.plot(ax=axes[1],title='Evolution of ROI')
fig.tight_layout()
From 2000 to 2015, budget of movies has not chanage a lot, but the revenue has increase on an even keel.
2.3 The factors of revenue and rating
sns.heatmap(movies0015.corr(),annot=True,vmax=1,square=True,cmap="Greens")
2.4 Trend of movies genres
def countF(column):
count=dict()
for row in column:
for ele in row:
if ele in count:
count[ele]+=1
else:
count[ele]=1
return count
genres=pd.Series(countF(movies0015.genres)).sort_values()
genres_avg=genres/len(movies0015)
#print(genres_avg)
color=[x['color'] for x in list(plt.rcParams['axes.prop_cycle'])]
genres_avg.plot(kind='barh',title='Frequency of Genres',color=color)
# select top 10 genres and observe the trend in the 15 years.
genres_by_year = movies0015.groupby('year').genres.sum()
genres_count = pd.DataFrame([], index = genres_by_year.index, columns = genres.index[:10])
for g in genres_count.columns:
for y in genres_count.index:
genres_count.loc[y,g] = genres_by_year[y].count(g) / len(genres_by_year[y])
genres_count.plot(figsize = (10,6), title = 'Evolution of Movies in 10 Genres')
2.5 Revenue of different genres
movies_by_genres = pd.DataFrame(0, index = genres.index, columns = ['revenue', 'budget', 'vote'])
for i in range(len(movies0015)):
for g in movies0015.genres[i]:
movies_by_genres.loc[g, 'revenue'] += movies0015.revenue[i]
movies_by_genres.loc[g, 'budget'] += movies0015.budget[i]
movies_by_genres.loc[g, 'vote'] += movies0015.vote_average[i]
movies_by_genres = movies_by_genres.div(genres.values, axis=0)
movies_by_genres['ROI'] = (movies_by_genres.revenue - movies_by_genres.budget) / movies_by_genres.budget
fig, axes = plt.subplots(2, 1, figsize=(8, 8))
movies_by_genres.sort_values('revenue', ascending=False)[['revenue', 'budget']].plot(ax=axes[0], kind = 'bar', title='Average Revenue and Budget in Different Genres')
color=[x['color'] for x in list(plt.rcParams['axes.prop_cycle'])]
movies_by_genres.sort_values('revenue', ascending=False)['ROI'].plot(ax=axes[1], kind = 'bar', title='ROI in Different Genres',color=color)
fig.tight_layout()
2.6 The relation between revenue and director
director_avg_rvn=movies0015.groupby('director').revenue.mean()
director_avg_rvn.hist(bins=100,figsize=(8,3))
# Top 10 directors
color=[x['color'] for x in list(plt.rcParams['axes.prop_cycle'])]
director_avg_rvn.sort_values().tail(10).plot(kind='barh',title='Top 10 Directors',color=color)
director_rvn_grs=movies0015[movies0015.genres.str.contains('Science Fiction',regex=False)].groupby('director').revenue.mean()
color=[x['color'] for x in list(plt.rcParams['axes.prop_cycle'])]
director_rvn_grs.sort_values().tail(10).plot(kind='barh',title='Top Derectors of Science Fiction',color=color)
2.7 The relation between revenue and star actors
# we dont consider the voice actor now
movies_nova=movies0015[~movies0015.genres.str.contains('Animation',regex=False)]
movies_nova=movies_nova.reset_index(drop='True')
stars=pd.Series(countF(movies_nova.stars)).sort_values()
# print(stars)
movies_by_stars=pd.DataFrame(0,index=stars.index,columns=['revenue','vote'])
#
w4=[0.4,0.3,0.2,0.1]
w3=[0.4,0.35,0.25]
w2=[0.6,0.4]
w1=[1.0]
weight=[w1,w2,w3,w4]
for i in range(len(movies_nova)):
actorlist=movies_nova.stars[i][:4]
# print(actorlist)
for j in range(len(actorlist)):
movies_by_stars.loc[actorlist[j],'revenue']+=movies_nova.revenue[i]*weight[len(actorlist)-1][j]
movies_by_stars.loc[actorlist[j],'vote']+=movies_nova.vote_average[i]
movies_by_stars=movies_by_stars.div(stars.values,axis=0)
movies_by_stars.revenue.hist(bins=100)
# Top 10 start actors
color=[x['color'] for x in list(plt.rcParams['axes.prop_cycle'])]
movies_by_stars.revenue.sort_values().tail(10).plot(kind='barh',title='Top 10 Start Actors',color=color)
2.8 The relation between revenue and release date
movies0015['month']=pd.to_datetime(movies_credits['release_date']).apply(lambda x:x.month)
movies0015['day']=pd.to_datetime(movies_credits['release_date']).apply(lambda x:x.day)
movies0015.month.hist()
# Each day od December
movies0015[movies0015.month==12].day.hist()
# Revenue and date
rvn_month=movies0015.groupby('month').revenue.sum()/movies0015.groupby('month').size()
color=[x['color'] for x in list(plt.rcParams['axes.prop_cycle'])]
rvn_month.plot(kind='bar',title='Average Revenue per Month',color=color)
3.1 Select feature sets
# Create discrete feature sets
def discrete(kw1,kw2):
binary=[]
for kw in kw1.index:
if kw in kw2:
binary.append(1)
else:
binary.append(0)
return binary
movies0015['genres_bin']=[discrete(genres,x) for x in movies0015.genres]
directors=movies0015.groupby('director').size().sort_values(ascending=False)
movies0015['director_bin']=[discrete(directors,x) for x in movies0015.director]
stars=pd.Series(countF(movies0015.stars)).sort_values(ascending=False)
movies0015['stars_bin']=[discrete(stars,x) for x in movies0015.stars]
selected_data=movies0015[['genres_bin','director_bin','stars_bin','budget','revenue']]
# selected_data=movies0015[['genres','director','stars','revenue']]
# split data for train and test
#movies0015.info()
selected_data.info()
X_train, X_test, y_train, y_test = train_test_split(selected_data.iloc[:,:4], selected_data.iloc[:,4], test_size=0.2,random_state=0)
y_train=y_train.to_frame()
y_test=y_test.to_frame()
print("X_train: ",type(X_train),X_train.shape)
print("y_train: ",type(y_train),y_train.shape)
print("X_test: ",type(X_test),X_test.shape)
print("y_test: ",type(y_train),y_test.shape)
# print(X_train)
X_train.info()
y_train.info()
# X_train.head(1)
# y_train.head(1)
# predict
def accuracy(prediction,truth):
correct=0
# print(type(prediction),prediction)
# print(type(truth),truth)
prediction=prediction.tolist()
truth=truth['revenue'].values.tolist()
for p,t in zip(prediction,truth):
# print(type(p))
# print(type(t))
if p>=0.9*t and p<=1.1*t:
correct+=1
return correct/len(truth)
# Dont use regression tree because sklearn does not support features with
# categorical attributes form.
def onehot2index(onehot):
# print(onehot)
index=onehot.index(1)
return index
X_train[['genres_bin','director_bin','stars_bin']]=X_train[['genres_bin','director_bin','stars_bin']].applymap(onehot2index)
X_test[['genres_bin','director_bin','stars_bin']]=X_test[['genres_bin','director_bin','stars_bin']].applymap(onehot2index)
# X_train2.applymap(onehot2index)
reg_tree=tree.DecisionTreeRegressor()
reg_tree=reg_tree.fit(X_train,y_train)
acc_train=accuracy(reg_tree.predict(X_train),y_train)
acc_test=accuracy(reg_tree.predict(X_test),y_test)
print("Accuracy of train = {:.3f}".format(acc_train))
print("Accuracy of test = {:.3f}".format(acc_test))
dot_data=tree.export_graphviz(reg_tree,out_file=None,
filled=True, rounded=True,
special_characters=True)
graph=pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
We can see from the result that the model got high accuracy on train data set and very low accuracy in test data sets. This problem may be caused by categorical attribute which can not be correctly splited like numeric feature.
We need pruning to avoid overfiting problem.
#limit max_depth to do pruning
reg_tree=tree.DecisionTreeRegressor(max_depth=5)
reg_tree=reg_tree.fit(X_train,y_train)
acc_train=accuracy(reg_tree.predict(X_train),y_train)
acc_test=accuracy(reg_tree.predict(X_test),y_test)
print("Accuracy of train = {:.3f}".format(acc_train))
print("Accuracy of test = {:.3f}".format(acc_test))
dot_data=tree.export_graphviz(reg_tree,out_file=None,
filled=True, rounded=True,
special_characters=True)
graph=pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
Try to use MLP
# MLP
mlp=MLPRegressor(max_iter=1000)
mlp.fit(X_train,y_train)
acc_train=accuracy(mlp.predict(X_train),y_train)
acc_test=accuracy(mlp.predict(X_test),y_test)
print("Accuracy of train = {:.3f}".format(acc_train))
print("Accuracy of test = {:.3f}".format(acc_test))
Instead of use regression tree, we try to use a kind a method which is based on similiarity.
def distance(movie1,movie2):
dis=0
lst=[[movie1.genres_bin,movie2.genres_bin],
[movie1.director_bin,movie2.director_bin],
[movie1.stars_bin,movie2.stars_bin]
]
for f1,f2 in lst:
if (i not in f1) or (1 not in f2):
d=1
else:
d=spatial.distance.cosine(f1,f2)
dis+=d
return dis
def predictor(new_movie):
mv=pd.Series()
mv['genres_bin']=new_movie['genres_bin']
mv['director_bin']=new_movie['director_bin']
mv['stars_bin']=new_movie['stars_bin']
mv['budget']=new_movie['budget']
p=selected_data.copy()
p['distance']=[distance(p.iloc[i],mv) for i in range(len(p))]
p=p.sort_values('distance')
p_avg=np.mean(p.revenue[0:5]/p.budget[0:5])*new_movie['budget']
return p_avg
def accuracy2(movies,truth):
n=len(movies)
correct=0
for index,row in movies.iterrows():
m={'genres_bin':row['genres_bin'],
'director_bin':row['director_bin'],
'stars_bin':row['stars_bin'],
'budget':row['budget']
}
# print(m)
p=predictor(m)
if p>=0.1*truth[index] and p<=10*truth[index]:
correct+=1
# print(p," / ",truth[index]," = ",p/truth[index])
return correct/n
total_train=0
total_test=0
k=1 # have test k=3, the result has been recorded in report
for i in range(k):
X_train, X_test, y_train, y_test = train_test_split(selected_data.iloc[:,:4], selected_data.iloc[:,4], test_size=0.2,random_state=1)
acc2_train=accuracy2(X_train,y_train)
# print(acc2_train)
acc2_test=accuracy2(X_test,y_test)
print("Iteration: ",i)
total_train+=acc2_train
total_test+=acc2_test
print("Accuracy of train = {:.3f}".format(acc2_train))
print("Accuracy of test = {:.3f}".format(acc2_test))
print("Average accuracy of train = {:.3f}".format(total_train/k))
print("Average accuracy of test = {:.3f}".format(total_test/k))